0: Preparation

Defining the input and output files

# Clean the working environment
rm(list = ls())

knitr::opts_chunk$set(echo = TRUE)

# empirical_species <- "German Shepherd"

empirical_dog_breed <- Sys.getenv("empirical_dog_breed")

empirical_species <- paste(toupper(substring(unlist(strsplit(empirical_dog_breed, "_")), 1, 1)),
                             substring(unlist(strsplit(empirical_dog_breed, "_")), 2),
                             sep = "", collapse = " ")
# empirical_species <- "Labrador Retriever"
# document_title <- paste(empirical_species,"Pipeline Results - 50 N_e bottleneck scenario")

# document_title <- paste(empirical_species,"55 Gen Breed Formation - SNP chip: Post_bottleneck_generation - Mutations introduced - 50 N_e bottleneck scenario")

# document_title <- paste(empirical_species,"40 Gen Breed Formation - SNP chip: Post_bottleneck_generation - Mutations introduced - 50 N_e bottleneck scenario")

N_e_bottleneck <- as.numeric(Sys.getenv("N_e_bottleneck"))
n_simulated_generations_breed_formation <- as.numeric(Sys.getenv("n_simulated_generations_breed_formation"))
reference_population_for_snp_chip <- Sys.getenv("reference_population_for_snp_chip")

document_title <- paste0(empirical_species," ",n_simulated_generations_breed_formation," Gen Breed Formation - SNP chip: ",reference_population_for_snp_chip," - ",N_e_bottleneck," N_e bottleneck scenario")

document_sub <- Sys.getenv("document_sub_title") 


# # 
# # MAF_pruning_used <- TRUE
# MAF_pruning_used <- FALSE
# 
# N_e <- 340
# # document_sub <- paste("MAF 0.05 used, but only for H_E-computation, N_e=", N_e)
# # document_sub <- paste("MAF 0.01 used, but only for H_E-computation, N_e=", N_e)
# 
# 
# 
# 
# 
# if (MAF_pruning_used == FALSE) {
#   document_sub <- paste("No MAF-based pruning used, N_e =", N_e)
# 
# 
# } else {
#   document_sub <- paste("MAF-based pruning used, N_e =", N_e)
# }

############################################ 
# Parameters used for displaying the result
############################################ 
ROH_frequency_decimals <- 1
H_e_values_decimals <- 3
F_ROH_values_decimals <- 3
Window_lengths_decimals <- 2

####################################  
# Defining the input files
#################################### 

Selection_strength_test_dir <- Sys.getenv("Selection_strength_test_dir")

Sweep_test_dir <-  Sys.getenv("Sweep_test_dir")

############### 
## Empirical ###
###############

### ROH hotspots ###
Empirical_data_ROH_hotspots_dir  <- Sys.getenv("Empirical_data_ROH_hotspots_dir")
Empirical_data_autosome_ROH_freq_dir <- Sys.getenv("Empirical_data_autosome_ROH_freq_dir")
### Inbreeding coefficient ###

Empirical_data_F_ROH_dir  <- Sys.getenv("Empirical_data_F_ROH_dir")

### Expected Heterozygosity distribution ###
Empirical_data_H_e_dir <- Sys.getenv("Empirical_data_H_e_dir")

############### 
## Simulated ###
###############

### Causative variant ###
variant_freq_plots_dir <- Sys.getenv("variant_freq_plots_dir")
variant_position_dir <- Sys.getenv("variant_position_dir")
pruned_replicates_count_dir <- Sys.getenv("pruned_replicates_count_dir")

Selection_causative_variant_windows_dir <- Sys.getenv("Selection_causative_variant_windows_dir")
causative_variant_windows_H_e_dir <- Sys.getenv("causative_variant_windows_H_e_dir")

### ROH hotspots ###
Neutral_model_ROH_hotspots_dir <- Sys.getenv("Neutral_model_ROH_hotspots_dir")
Neutral_model_autosome_ROH_freq_dir <- Sys.getenv("Neutral_model_autosome_ROH_freq_dir")

Selection_model_ROH_hotspots_dir  <- Sys.getenv("Selection_model_ROH_hotspots_dir")
Selection_model_autosome_ROH_freq_dir <- Sys.getenv("Selection_model_autosome_ROH_freq_dir")

### Inbreeding coefficient ###
Neutral_model_F_ROH_dir  <- Sys.getenv("Neutral_model_F_ROH_dir")
Selection_model_F_ROH_dir  <- Sys.getenv("Selection_model_F_ROH_dir")

### Expected Heterozygosity distribution ###
Neutral_model_H_e_dir <- Sys.getenv("Neutral_model_H_e_dir")
Selection_model_H_e_dir <- Sys.getenv("Selection_model_H_e_dir")


histogram_line_sizes <- 3
empirical_data_color <- "darkgreen"
neutral_model_color <- "blue" 
selection_model_color <- "purple"

output_dir <- Sys.getenv("Pipeline_results_output_dir") 


# Sys.getenv()  

# # Set the working directory for notebook chunks
# knitr::opts_knit$set(root.dir = output_dir_sweep_test)

Loading libraries

if (!require(knitr)) install.packages("knitr", dependencies = TRUE)
## Loading required package: knitr
## Warning: package 'knitr' was built under R version 4.3.2
library(knitr)

if (!require(ggplot2)) install.packages("ggplot2", dependencies = TRUE)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.1
library(ggplot2)

if (!require(scatterplot3d)) install.packages("scatterplot3d", dependencies = TRUE)
## Loading required package: scatterplot3d
library(scatterplot3d) # For generating the 3d plots 

if (!require(RColorBrewer)) install.packages("RColorBrewer", dependencies = TRUE)
## Loading required package: RColorBrewer
library(RColorBrewer) # For generating a color palette

Latex formatting function

Causative variant

Variant fixation

Fixation probability

Fixation time

# Function to determine the number of rows until fixation is reached
time_to_fixation <- function(data, threshold = 0.9) {
  # Find the index of the first row where the allele frequency is 0.9 or higher
  fixation_index <- which(data$allele_frequency >= threshold)[1]
  
  # Return the number of rows until fixation is reached
  return(fixation_index - 1)
}

Summary - Variant fixation

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_01/Causative_variant_Fixation_summary.txt"
Selection_coefficient Fixation_probability Avg_Fixation_time Min_fixation_time Max_fixation_time
s=0.1 34.5 53 37 72
s=0.2 83.3 34 21 50
s=0.3 83.3 22 11 33
s=0.4 76.9 16 11 20
s=0.5 66.7 12 9 17
s=0.6 69.0 10 7 12
s=0.7 66.7 8 6 10
s=0.8 71.4 7 5 8

Causative variant windows

Variant position

Window lengths

Causative variant windows

Summary - Causative variant windows

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_01/Causative_variant_windows_summary.txt"
Selection_coefficient Avg_Length_Mb Avg_window_freq Min_freq Max_freq Avg_freq_variant_100k_window
2 s=0.2 1.46 82.3 35.5 100.0 84.3
3 s=0.3 2.25 79.1 18.1 100.0 80.3
7 s=0.7 2.45 74.1 23.1 100.0 75.2
1 s=0.1 2.49 59.0 17.9 99.1 61.0
6 s=0.6 3.02 80.7 17.9 100.0 82.0
4 s=0.4 3.04 81.7 8.5 100.0 82.3
5 s=0.5 3.12 67.0 17.9 100.0 68.1
8 s=0.8 4.45 82.4 12.9 100.0 83.1

Standard Error Confidence interval function

Confidence level <=> konfidensgrad

# Function to calculate standard error and its confidence interval
standard_error_confidence_interval_fun <- function(observed_data, confidence_level = 0.95) {
  # if ( nrow(observed_data) > 1 ) {
  
  if ( length(observed_data) > 1 ) {
      
  # Calculate standard error
  n <- length(observed_data)
  standard_deviation <- sd(observed_data)
  standard_error <- standard_deviation / sqrt(n - 1)
  
  # Calculate confidence interval based on standard error

  alpha <- (1 - confidence_level) / 2
  margin_of_error <- qnorm(1 - alpha) * standard_error
  mean_estimate <- mean(observed_data)
  # Calculate the percentiles for the confidence interval
  confidence_interval_lower_bound <- mean_estimate - margin_of_error # 2.5th percentile (2σ)
  confidence_interval_upper_bound <- mean_estimate + margin_of_error # 97.5th percentile (2σ)
  
  # Return confidence interval
  return(c(confidence_interval_lower_bound, confidence_interval_upper_bound))

    
  } else {
    return(c(NA,NA)) 
    }
  
  
} 





# # Function to calculate bootstrap confidence intervals
# standard_error_confidence_interval_fun <- function(observed_data, n_bootstraps = 1000, confidence_level = 0.95) {
#   
#   # Function to calculate the mean for each bootstrap sample
#   compute_bootstrap_mean_fun <- function(observed_data_urn) {
#     bootstrap_dataset <- sample(observed_data_urn, replace = TRUE)
#     bootstrap_estimate <- mean(bootstrap_dataset)
#     return(bootstrap_estimate)
#   }
#   
#   # Perform bootstrap resampling
#   bootstrap_point_estimates <- replicate(n_bootstraps, compute_bootstrap_mean_fun(observed_data))
#   
#   # Calculate the percentiles for the confidence interval
#   alpha <- (1 - confidence_level) / 2
#   confidence_interval_lower_bound <- quantile(bootstrap_point_estimates, alpha) # 2.5th percentile
#   confidence_interval_upper_bound <- quantile(bootstrap_point_estimates, 1 - alpha) # 97.5th percentile
#   
#   # Return the confidence interval
#   return(c(confidence_interval_lower_bound, confidence_interval_upper_bound))
# }

1: ROH-Frequency

1.1 : Autosome ROH-frequencies

1.1.1 : Empirical - ROH frequency

1.1.2: Neutral Model - ROH frequency

1.1.3: Selection Model

1.1.4 Summary - ROH-hotspot threshold

## ROH-hotspot selection testing results://n
## [1] "/home/jonathan/results/Pipeline_results_MAF_0_01/ROH-hotspot_threshold_comparison.txt"
Model Avg_ROH_hotspot_threshold
Neutral 52.0
Empirical 55.4
s=0.1 64.6
s=0.5 79.2
s=0.3 81.8
s=0.2 82.8
s=0.7 85.8
s=0.8 87.1
s=0.4 87.5
s=0.6 88.8

1.2 ROH-hotspots - ROH Frequency and Length

2: Inbreeding coefficient

2.1 Empirical data

## Overall Average Avg_F_ROH for  labrador_retriever : 0.2333818 //n

2.2 Neutral Model

## Point estimate F_ROH across all 20 simulations: 0.2254513 //n
## [1] "Bootstrap 95% Confidence Interval: [0.210695324058223, 0.24020718182413]"

2.3 Selection Model

## Point estimate F_ROH across all 20 simulations for  selection_model_s01_chr3 : 0.2324361 //n[1] "Bootstrap 95% Confidence Interval: [0.219446141689648, 0.245425997133882]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s02_chr3 : 0.2602741 //n[1] "Bootstrap 95% Confidence Interval: [0.247307214840647, 0.273240978100529]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s03_chr3 : 0.293561 //n[1] "Bootstrap 95% Confidence Interval: [0.267891390833455, 0.319230526813604]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s04_chr3 : 0.3024522 //n[1] "Bootstrap 95% Confidence Interval: [0.282208055417251, 0.322696370465102]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s05_chr3 : 0.3123284 //n[1] "Bootstrap 95% Confidence Interval: [0.291190019682684, 0.333466850905551]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s06_chr3 : 0.3255663 //n[1] "Bootstrap 95% Confidence Interval: [0.305588675205435, 0.345543948323977]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s07_chr3 : 0.3400056 //n[1] "Bootstrap 95% Confidence Interval: [0.309288678924456, 0.370722596369662]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s08_chr3 : 0.366789 //n[1] "Bootstrap 95% Confidence Interval: [0.332464701241699, 0.401113258758301]"

2.4 F_ROH summary

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_01/F_ROH_comparison.txt"
Model F_ROH Lower_CI Upper_CI
Neutral 0.225 0.211 0.240
s=0.1 0.232 0.219 0.245
Empirical 0.233 NA NA
s=0.2 0.260 0.247 0.273
s=0.3 0.294 0.268 0.319
s=0.4 0.302 0.282 0.323
s=0.5 0.312 0.291 0.333
s=0.6 0.326 0.306 0.346
s=0.7 0.340 0.309 0.371
s=0.8 0.367 0.332 0.401

3: Expected Heterozygosity

3.1 Empirical data

3.2 Neutral Model

3.3 Selection Model

3.4 Causative Variant Window

## Point estimate H_e across all 20 simulations for  s01_chr3 : 0.2059108 //n[1] "Bootstrap 95% Confidence Interval: [0.148919377141177, 0.262902263176715]"
## Point estimate H_e across all 20 simulations for  s02_chr3 : 0.1155137 //n[1] "Bootstrap 95% Confidence Interval: [0.0744645686963791, 0.156562900986847]"
## Point estimate H_e across all 20 simulations for  s03_chr3 : 0.1523321 //n[1] "Bootstrap 95% Confidence Interval: [0.0977077932227925, 0.206956327399316]"
## Point estimate H_e across all 20 simulations for  s04_chr3 : 0.163488 //n[1] "Bootstrap 95% Confidence Interval: [0.0942804514523995, 0.232695484316527]"
## Point estimate H_e across all 20 simulations for  s05_chr3 : 0.2557699 //n[1] "Bootstrap 95% Confidence Interval: [0.17079472142343, 0.340745149224726]"
## Point estimate H_e across all 20 simulations for  s06_chr3 : 0.1795539 //n[1] "Bootstrap 95% Confidence Interval: [0.103228297762592, 0.255879481309857]"
## Point estimate H_e across all 20 simulations for  s07_chr3 : 0.2447062 //n[1] "Bootstrap 95% Confidence Interval: [0.169341380672388, 0.320070958740416]"
## Point estimate H_e across all 20 simulations for  s08_chr3 : 0.1602423 //n[1] "Bootstrap 95% Confidence Interval: [0.0961332827398887, 0.224351282059818]"

3.4 Genomewide Average H_e

Model H_e Lower_CI Upper_CI
Empirical 0.316 NA NA
s=0.4 0.319 0.311 0.327
s=0.8 0.319 0.309 0.329
s=0.3 0.321 0.314 0.327
s=0.6 0.321 0.311 0.332
s=0.7 0.323 0.313 0.333
s=0.2 0.326 0.322 0.330
s=0.5 0.326 0.320 0.332
s=0.1 0.332 0.328 0.337
Neutral 0.333 0.329 0.338

3.5 Genomewide 5th percentile comparison - Expected Heterozygosity Summary

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_01/Expected_Heterozygosity_Summary.txt"
Model H_e_5th_percentile Lower_CI Upper_CI
s=0.8 0.121 0.105 0.137
s=0.4 0.129 0.113 0.145
s=0.7 0.131 0.112 0.149
s=0.6 0.133 0.119 0.146
s=0.5 0.136 0.124 0.149
s=0.3 0.139 0.124 0.154
s=0.2 0.151 0.141 0.161
Empirical 0.151 NA NA
s=0.1 0.167 0.157 0.176
Neutral 0.174 0.165 0.183

4: Summary

4.0 General comparison

4.0.1 ROH-hotspot threshold comparison

## 
##  ROH-hotspot threshold comparison between the different datasets
Model Avg_ROH_hotspot_threshold
Neutral 52.0
Empirical 55.4
s=0.1 64.6
s=0.5 79.2
s=0.3 81.8
s=0.2 82.8
s=0.7 85.8
s=0.8 87.1
s=0.4 87.5
s=0.6 88.8

4.0.2 F_ROH comparison

## Models where empirical F_ROH is within CI:
## [1] "s=0.1"   "Neutral"
## 
## Models where empirical F_ROH is outside CI:
## [1] "s=0.2" "s=0.3" "s=0.4" "s=0.5" "s=0.6" "s=0.7" "s=0.8"
## png 
##   2
Model F_ROH Lower_CI Upper_CI
Neutral 0.225 0.211 0.240
s=0.1 0.232 0.219 0.245
Empirical 0.233 NA NA
s=0.2 0.260 0.247 0.273
s=0.3 0.294 0.268 0.319
s=0.4 0.302 0.282 0.323
s=0.5 0.312 0.291 0.333
s=0.6 0.326 0.306 0.346
s=0.7 0.340 0.309 0.371
s=0.8 0.367 0.332 0.401

4.1: Hotspot comparison

4.1.1: Selection test (Sweep test)

## [1] "Selection test results"
## [1] "ROH-hotspot windows with an mean H_e Value lower or equal to the lower confidence interval of the fifth percentile of the neutral model are classified as being under selection"
## [1] "5th percentile of the neutral model is: 0.1649792"
Name Under_selection Window_based_Average_H_e
Hotspot_chr1_window_1 Yes 0.096
Hotspot_chr28_window_1 Yes 0.118
Hotspot_chr24_window_1 No 0.186
Hotspot_chr13_window_1 No 0.190
Hotspot_chr11_window_2 No 0.205
Hotspot_chr15_window_1 No 0.223
Hotspot_chr11_window_1 No 0.225
Hotspot_chr6_window_1 No 0.232
Hotspot_chr8_window_1 No 0.260
## [1] "/home/jonathan/results/Pipeline_results_MAF_0_01/ROH_hotspots_Selection_testing_neutral_model_H_E_threshold_0.165.csv.txt"
## [1] "ROH-hotspots under selection:"
Name Under_selection Window_based_Average_H_e
Hotspot_chr1_window_1 Yes 0.096
Hotspot_chr28_window_1 Yes 0.118

4.1.2: Selection Strength Test (Sweep test)

4.1.1.1 Extracting Causative windows under selection

## [1] "/home/jonathan/results/Pipeline_results_MAF_0_01/Causative_windows_under_selection.txt"
Model H_e Lower_CI Upper_CI Under_Selection
s=0.2 0.116 0.074 0.157 Yes
s=0.3 0.152 0.098 0.207 Yes
s=0.8 0.160 0.096 0.224 Yes
s=0.4 0.163 0.094 0.233 Yes
Neutral 0.174 0.165 0.183 No
s=0.6 0.180 0.103 0.256 No
s=0.1 0.206 0.149 0.263 No
s=0.7 0.245 0.169 0.320 No
s=0.5 0.256 0.171 0.341 No

4.1.1.1 Extracting Hotspots under selection

*** Behöver bootstrap av 5th percentiles från neutral model

Hotspot - Causative Window - Comparison

Model Lengths_Mb ROH_Freq H_e Under_selection
s=0.8 4.45 82.4 0.160 Yes
s=0.7 2.45 74.1 0.245 No
s=0.6 3.02 80.7 0.180 No
s=0.5 3.12 67.0 0.256 No
s=0.4 3.04 81.7 0.163 Yes
s=0.3 2.25 79.1 0.152 Yes
s=0.2 1.46 82.3 0.116 Yes
s=0.1 2.49 59.0 0.206 No
Hotspot_chr28_window_1 1.40 75.0 0.118 Yes
Hotspot_chr1_window_1 0.20 61.9 0.096 Yes
# plot_title <- paste("Length And Average ROH % Comparison - ROH Hotspots & Causative Windows")
plot_title <- paste("ROH Hotspot(s) & Causative Windows comparison")


# Modify the labels by removing "Hotspot_" prefix and "_window" suffix from hotspot models
Hotspots_and_Causative_windows_comparison_sorted$Label <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  gsub("Hotspot_|_window", "", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  Hotspots_and_Causative_windows_comparison_sorted$Model
)

# Removing the "s=" part from the selection coefficients for the plot displayal
Hotspots_and_Causative_windows_comparison_sorted$Label <- gsub("^s=", "", Hotspots_and_Causative_windows_comparison_sorted$Label)



# Create a new column for identifying the type
Hotspots_and_Causative_windows_comparison_sorted$Type <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  "Hotspot", 
  "Selection Coefficient"
)


# # Create the 3D scatter plot
# s3d <- scatterplot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
#   pch = 19, # Solid circle
#   xlab = "Lengths",
#   ylab = "ROH Frequency",
#   zlab = "H_e",
#   main = plot_title
# )
# 
# # Add labels to the points
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# text(s3d.coords$x, s3d.coords$y, 
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
#      pos = 3, cex = 0.5) # pos=3 means above


# Create the 3D scatter plot
s3d <- scatterplot3d(
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
  Hotspots_and_Causative_windows_comparison_sorted$H_e,
  color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
  pch = 19, # Solid circle
  xlab = "Avg ROH-frequency (%)",
  ylab = "Length (Mb)",
  zlab = "Avg H_e",
  main = plot_title
)

# )

# Add labels to the points
s3d.coords <- s3d$xyz.convert(
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
  Hotspots_and_Causative_windows_comparison_sorted$H_e
)
text(s3d.coords$x, s3d.coords$y, 
     labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
     pos = 3, cex = 0.5) # pos=3 means above

# Test med skuggor

setwd(output_dir)

# plot_title <- paste("Length And Average ROH % Comparison - ROH Hotspots & Causative Windows")
plot_title <- paste("ROH Hotspot(s) & Causative Windows comparison")

# Modify the labels by removing "Hotspot_" prefix and "_window" suffix from hotspot models
Hotspots_and_Causative_windows_comparison_sorted$Label <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  gsub("Hotspot_|_window", "", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  Hotspots_and_Causative_windows_comparison_sorted$Model
)

# Removing the "s=" part from the selection coefficients for the plot display
Hotspots_and_Causative_windows_comparison_sorted$Label <- gsub("^s=", "", Hotspots_and_Causative_windows_comparison_sorted$Label)

# Create a new column for identifying the type
Hotspots_and_Causative_windows_comparison_sorted$Type <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  "Hotspot", 
  "Selection Coefficient"
)

# Generate a color palette for the hotspots
hotspot_models <- unique(Hotspots_and_Causative_windows_comparison_sorted$Model[Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot"])
num_hotspots <- length(hotspot_models)

# Choose the color palette based on the number of hotspots
if (num_hotspots == 2) {
  color_palette_name <- "Set2"
} else {
  color_palette_name <- "Set3"
}

# Get the colors for the hotspots
hotspot_colors <- setNames(brewer.pal(n = num_hotspots, name = color_palette_name), hotspot_models)
## Warning in brewer.pal(n = num_hotspots, name = color_palette_name): minimal value for n is 3, returning requested palette with 3 different levels
# Assign colors to each point
Hotspots_and_Causative_windows_comparison_sorted$Color <- ifelse(
  Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", 
  hotspot_colors[Hotspots_and_Causative_windows_comparison_sorted$Model], 
  "blue"
)

x_value <- Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq
x_axis_label <- "Avg ROH-frequency (%)"
y_value <- Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
y_axis_label <- "Length (Mb)"
z_value <- Hotspots_and_Causative_windows_comparison_sorted$H_e
z_axis_label <- "Avg H_e"


# x_value <- Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
# x_axis_label <- "Length (Mb)"
# y_value <- Hotspots_and_Causative_windows_comparison_sorted$H_e
# y_axis_label <- "Avg H_e"
# z_value <- Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq
# z_axis_label <- "Avg ROH-frequency (%)"


x_value <- Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq
x_axis_label <- "Avg ROH-frequency (%)"
y_value <- Hotspots_and_Causative_windows_comparison_sorted$H_e
y_axis_label <- "Avg H_e"
z_value <- Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
z_axis_label <- "Length (Mb)"

# x_value <- Hotspots_and_Causative_windows_comparison_sorted$H_e
# x_axis_label <- "Avg H_e"
# y_value <- Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq
# y_axis_label <- "Avg ROH-frequency (%)"
# z_value <- Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
# z_axis_label <- "Length (Mb)"

# # Create and save the 3D scatter plot as a PNG file
# png(filename = "3dplot_Hotspot_Causative_Window_Comparison.png", width = 1920, height = 1080, res = 300)
# png(filename = "3dplot_Hotspot_Causative_Window_Comparison.png",width = 800, height = 600, res = 300)


# Create the 3D scatter plot
s3d <- scatterplot3d(
  x_value  ,
  y_value,
  z_value,
  color = Hotspots_and_Causative_windows_comparison_sorted$Color,
  pch = 19, # Solid circle
  xlab = x_axis_label,
  ylab = y_axis_label,
  zlab = z_axis_label,
  main = plot_title
)

# Add shadows on the x-y plane (z = 0)
s3d$points3d(
  x_value,
  y_value,
  rep(0, nrow(Hotspots_and_Causative_windows_comparison_sorted)),  # Set z to 0 for shadow
  col = adjustcolor(Hotspots_and_Causative_windows_comparison_sorted$Color, alpha.f = 0.3), # Semi-transparent shadows
  pch = 19
)

# Convert coordinates for adding labels
s3d.coords <- s3d$xyz.convert(
  x_value,
  y_value,
  z_value
)

# Add labels to the original points
text(s3d.coords$x, s3d.coords$y, 
     labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
     pos = 3, cex = 0.5) # pos=3 means above

# # Close the graphics device
# dev.off()

Fixar överlappande labels

# # Increase plot size
# par(mar = c(5, 5, 5, 5)) # Adjust margins if needed
# 
# s3d <- scatterplot3d(
#   # x = Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   # y = Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   # z = Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   
#   # x = Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   # y = Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   # z = Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq ,
#   
#   # x = Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb ,
#   # y = Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   # z = Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq ,
#   
#   x = Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   y = Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   z = Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb  ,
# 
#   
#   color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
#   pch = 19, 
#   # xlab = "Avg ROH-frequency (%)",
#   # ylab = "Length (Mb)",
#   # zlab = "H_e",
#   
#   # xlab = "Length (Mb) ",
#   # ylab = "H_e",
#   # zlab = "Avg ROH-frequency (%)",
#   
#   # xlab = "Length (Mb) ",
#   # ylab = "H_e",
#   # zlab = "Avg ROH-frequency (%)",
#   
#   xlab = "Avg ROH-frequency (%)",
#   ylab = "H_e",
#   zlab = "Length (Mb)",
# 
# 
#   
#   main = plot_title
# )
# # # Add labels to the points
# # s3d.coords <- s3d$xyz.convert(
# #   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
# #   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
# #   Hotspots_and_Causative_windows_comparison_sorted$H_e
# # )
# 
# 
# # Add labels to the points
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb
# )
# 
# # 
# # # Add jitter to the label positions
# # text(s3d.coords$x + runif(nrow(Hotspots_and_Causative_windows_comparison_sorted), -0.04, 0.04),
# #      s3d.coords$y + runif(nrow(Hotspots_and_Causative_windows_comparison_sorted), -0.04, 0.04),
# #      labels = Hotspots_and_Causative_windows_comparison_sorted$Label,
# #      pos = 3, cex = 0.5)
# # 
# # 
# text(s3d.coords$x, s3d.coords$y,
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label,
#      pos = 3, cex = 0.5) # pos=3 means above

jittering on all axis

# setwd(output_dir)
# 
# # Identify indices for selection and non-selection points based on labels starting with '0'
# selection_indices <- grepl("^0", Hotspots_and_Causative_windows_comparison_sorted$Label)
# non_selection_indices <- !selection_indices
# 
# # windows(width = 1920 / 96, height = 1080 / 96)  # 96 DPI is default
# # # # Create and save the 3D scatter plot as a PNG file
# # png(filename = "3dplot_Hotspot_Causative_Window_Comparison.png", width = 1920, height = 1080)
# 
# # Create the 3D scatter plot with both selection and non-selection points
# s3d <- scatterplot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   color = ifelse(selection_indices, "blue", "red"), # Blue for selection, Red for hotspots
#   pch = 19, # Solid circle
#   xlab = "Avg ROH-frequency (%)",
#   ylab = "Length (Mb)",
#   zlab = "H_e",
#   main = plot_title
# )
# # Convert 3D coordinates for labeling
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# # Subset data for non-selection points
# non_selection_data <- Hotspots_and_Causative_windows_comparison_sorted[non_selection_indices, ]
# # s3d.coords$x[non_selection_indices]
# # 7.888889 6.278889 6.715556 7.655556 6.674444 5.723333 6.143333 5.773333 6.212222
# # custom_jitter_x_axis_hotspots <- c(0,0,-0.05,0,0.1,-0.4,0,0,0)  # Adjust as needed
# custom_jitter_x_axis_hotspots <- c(0,0,0,0,0,0,0,0,0)  # Adjust as needed
# 
# # s3d.coords$y[non_selection_indices]
# #3.7711111 2.4511111 1.9444444 1.5644444 2.5355556 3.1066667 2.6066667 3.1466667 0.8977778
# # custom_jitter_y_axis_hotspots <- c(0,-0.7,-0.65,0,0,-0.3,0,0.1,0)   # Adjust as needed
# custom_jitter_y_axis_hotspots <- c(0,0,0,0,0,0,0,0,0)   # Adjust as needed
# 
# # Add labels for non-selection points without jitter
# text(s3d.coords$x[non_selection_indices]+ custom_jitter_x_axis_hotspots, 
#      s3d.coords$y[non_selection_indices]+ custom_jitter_y_axis_hotspots, 
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label[non_selection_indices], 
#      pos = 3, cex = 0.7)
# 
# 
# 
# selection_data <- Hotspots_and_Causative_windows_comparison_sorted[selection_indices, ]
# custom_jitter_x_axis_sel_coeff <- c(0.2,0.05,0.025,0,-0.05,0,0)
# # custom_jitter_x_axis_sel_coeff <- c(0,0,0,0,0,0,0) 
# custom_jitter_y_axis_sel_coeff <- c(-0.25,0,-0.57,-0.1,-0.6,0,0)
# # custom_jitter_y_axis_sel_coeff <- c(0,0,0,0,0,0,0) 
# # Apply jitter to selection point labels
# label_jitter <- runif(sum(selection_indices), 0, 0.2) # Small random jitter
# # Add labels for selection points with jitter
# text(s3d.coords$x[selection_indices] + custom_jitter_x_axis_sel_coeff, 
#      s3d.coords$y[selection_indices] + custom_jitter_y_axis_sel_coeff, 
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label[selection_indices], 
#      pos = 3, cex = 0.7)

# # Close the graphics device
# dev.off()

# kable(selection_data,row.names = FALSE)
# kable(non_selection_data,row.names = FALSE)

#Jitter z-axis

# # Create the 3D scatter plot with both selection and non-selection points
# s3d <- scatterplot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   color = ifelse(selection_indices, "blue", "red"), # Blue for selection, Red for hotspots
#   pch = 19, # Solid circle
#   xlab = "Avg ROH-frequency (%)",
#   ylab = "Length (Mb)",
#   zlab = "H_e",
#   main = plot_title
# )
# 
# # Convert 3D coordinates for labeling
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# 
# # Generate small random jitter for the y-axis (simulating z-axis movement)
# y_jitter <- runif(sum(selection_indices), -0.02, 0.02)
# 
# # Adjust y-coordinates for selection labels only
# jittered_y <- s3d.coords$y[selection_indices] + y_jitter
# 
# # Add labels for selection points with y-axis adjustment
# text(s3d.coords$x[selection_indices],
#      jittered_y,
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label[selection_indices],
#      pos = 3, cex = 0.7)
# # 
# # Add labels for non-selection points without jitter
# text(s3d.coords$x[non_selection_indices],
#      s3d.coords$y[non_selection_indices],
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label[non_selection_indices],
#      pos = 3, cex = 0.7)
# Sort the data frame based on average fixation time, in descending order
 kable(Hotspots_and_Causative_windows_comparison[order(-Hotspots_and_Causative_windows_comparison$ROH_Freq), ])
Model Lengths_Mb ROH_Freq H_e Under_selection
10 s=0.8 4.45 82.4 0.160 Yes
4 s=0.2 1.46 82.3 0.116 Yes
6 s=0.4 3.04 81.7 0.163 Yes
8 s=0.6 3.02 80.7 0.180 No
5 s=0.3 2.25 79.1 0.152 Yes
2 Hotspot_chr28_window_1 1.40 75.0 0.118 Yes
9 s=0.7 2.45 74.1 0.245 No
7 s=0.5 3.12 67.0 0.256 No
1 Hotspot_chr1_window_1 0.20 61.9 0.096 Yes
3 s=0.1 2.49 59.0 0.206 No
 kable(Hotspots_and_Causative_windows_comparison[order(-Hotspots_and_Causative_windows_comparison$H_e), ])
Model Lengths_Mb ROH_Freq H_e Under_selection
7 s=0.5 3.12 67.0 0.256 No
9 s=0.7 2.45 74.1 0.245 No
3 s=0.1 2.49 59.0 0.206 No
8 s=0.6 3.02 80.7 0.180 No
6 s=0.4 3.04 81.7 0.163 Yes
10 s=0.8 4.45 82.4 0.160 Yes
5 s=0.3 2.25 79.1 0.152 Yes
2 Hotspot_chr28_window_1 1.40 75.0 0.118 Yes
4 s=0.2 1.46 82.3 0.116 Yes
1 Hotspot_chr1_window_1 0.20 61.9 0.096 Yes
# # Load the required packages
# library(rgl)
# 
# # Identify indices for selection and non-selection points based on labels starting with '0'
# selection_indices <- grepl("^0", Hotspots_and_Causative_windows_comparison_sorted$Label)
# non_selection_indices <- !selection_indices
# 
# # Open a new 3D plotting device
# open3d()
# 
# # Plot with rgl
# plot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   col = ifelse(selection_indices, "blue", "red"), # Blue for selection, Red for hotspots
#   type = 's', # Use points
#   xlab = "Avg ROH-frequency (%)",
#   ylab = "Length (Mb)",
#   zlab = "H_e",
#   main = plot_title
# )
# 
# # Convert 3D coordinates for labeling
# s3d_coords <- cbind(
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# 
# # Add labels for non-selection points with custom jitter
# text3d(
#   s3d_coords[non_selection_indices, ] + cbind(custom_jitter_x_axis_hotspots, custom_jitter_y_axis_hotspots, 0),
#   texts = Hotspots_and_Causative_windows_comparison_sorted$Label[non_selection_indices],
#   col = "red", # Color of the text
#   cex = 0.7
# )
# 
# # Add labels for selection points with custom jitter
# text3d(
#   s3d_coords[selection_indices, ] + cbind(custom_jitter_x_axis_sel_coeff, custom_jitter_y_axis_sel_coeff, label_jitter),
#   texts = Hotspots_and_Causative_windows_comparison_sorted$Label[selection_indices],
#   col = "blue", # Color of the text
#   cex = 0.7
# )

# # Save the 3D plot as a PNG file
# rgl.postscript("3dplot_Hotspot_Causative_Window_Comparison.png", fmt = "png")

# # Close the 3D device
# rgl.close()

Visualizing and scaling

# Min-max-scaling normalization function, that normalizes a vector of values
min_max_normalization_fun <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
### Min max scaling ###
# Normalize Lengths_Mb
Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb_Normalized <- min_max_normalization_fun(Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb)
# Normalize ROH Frequency
Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq_Normalized <- min_max_normalization_fun(Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq)
# Normalize H_e
Hotspots_and_Causative_windows_comparison_sorted$H_e_Normalized <- min_max_normalization_fun(Hotspots_and_Causative_windows_comparison_sorted$H_e)

# Modify the labels by removing "Hotspot_" prefix and "_window" suffix from hotspot models
Hotspots_and_Causative_windows_comparison_sorted$Label <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  gsub("Hotspot_|_window", "", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  Hotspots_and_Causative_windows_comparison_sorted$Model
)

# Create a new column for identifying the type
Hotspots_and_Causative_windows_comparison_sorted$Type <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  "Hotspot", 
  "Selection Coefficient"
)

plot_title <- paste("Normalized ROH Hotspot(s) & Causative Windows comparison")


# Create the 3D scatter plot
s3d <- scatterplot3d(
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$H_e_Normalized,
  color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
  pch = 19, # Solid circle
  xlab = "Normalized Lengths",
  ylab = "Normalized Mean ROH Frequency",
  zlab = "Normalized H_e",
  main = plot_title
)

# Add labels to the points
s3d.coords <- s3d$xyz.convert(
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$H_e_Normalized
)
text(s3d.coords$x, s3d.coords$y, 
     labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
     pos = 3, cex = 0.5) # pos=3 means above

5 Visualizing Expected Heterozygoisty

5.1 Bootstrap CI for mean 5th percentile H_e

## TRUE FALSE TRUE TRUE

## Models where empirical H_e is within CI for Hotspot_chr1_window_1 :
## [1] "s=0.2" "s=0.4" "s=0.8"
## 
## Models where empirical H_e is outside CI for Hotspot_chr1_window_1 :
## [1] "s=0.3"
## TRUE TRUE TRUE TRUE

## Models where empirical H_e is within CI for Hotspot_chr28_window_1 :
## [1] "s=0.2" "s=0.3" "s=0.4" "s=0.8"
## 
## Models where empirical H_e is outside CI for Hotspot_chr28_window_1 :
## character(0)
## FALSE TRUE TRUE TRUE

## Models where empirical H_e is within CI for Hotspot_chr24_window_1 :
## [1] "s=0.3" "s=0.4" "s=0.8"
## 
## Models where empirical H_e is outside CI for Hotspot_chr24_window_1 :
## [1] "s=0.2"
## FALSE TRUE TRUE TRUE

## Models where empirical H_e is within CI for Hotspot_chr13_window_1 :
## [1] "s=0.3" "s=0.4" "s=0.8"
## 
## Models where empirical H_e is outside CI for Hotspot_chr13_window_1 :
## [1] "s=0.2"
## FALSE TRUE TRUE TRUE

## Models where empirical H_e is within CI for Hotspot_chr11_window_2 :
## [1] "s=0.3" "s=0.4" "s=0.8"
## 
## Models where empirical H_e is outside CI for Hotspot_chr11_window_2 :
## [1] "s=0.2"
## FALSE FALSE TRUE TRUE

## Models where empirical H_e is within CI for Hotspot_chr15_window_1 :
## [1] "s=0.4" "s=0.8"
## 
## Models where empirical H_e is outside CI for Hotspot_chr15_window_1 :
## [1] "s=0.2" "s=0.3"
## FALSE FALSE TRUE FALSE

## Models where empirical H_e is within CI for Hotspot_chr11_window_1 :
## [1] "s=0.4"
## 
## Models where empirical H_e is outside CI for Hotspot_chr11_window_1 :
## [1] "s=0.2" "s=0.3" "s=0.8"
## FALSE FALSE TRUE FALSE

## Models where empirical H_e is within CI for Hotspot_chr6_window_1 :
## [1] "s=0.4"
## 
## Models where empirical H_e is outside CI for Hotspot_chr6_window_1 :
## [1] "s=0.2" "s=0.3" "s=0.8"
## FALSE FALSE FALSE FALSE

## Models where empirical H_e is within CI for Hotspot_chr8_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr8_window_1 :
## [1] "s=0.2" "s=0.3" "s=0.4" "s=0.8"

5.2 5th percentile of the extreme H_e values